We have chosen this topic to dwell on the rising concern of global warming and to gain useful insights from real-world data. As data scientists, we are trying to bring about change by predicting future temperature changes and making others aware of the consequences they will face.
MajorCities
[CityID] INTEGER NOT NULL PRIMARY KEY,
[CityName] TEXT NOT NULL,
[Latitude] TEXT NOT NULL,
[Longitude] TEXT NOT NULL)
Country
[CountryID] INTEGER NOT NULL PRIMARY KEY,
[Country] TEXT NOT NULL)
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as pp
import sqlite3
from sqlite3 import Error
from datetime import datetime
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA, ARMAResults
from sklearn.metrics import mean_squared_error
import ipywidgets as widgets
#Defining required functions for Data Manipulation
def create_connection(db_file, delete_db=False):
    """Open (and optionally recreate) a SQLite database connection.

    When *delete_db* is true an existing file at *db_file* is removed first,
    giving a fresh database. Foreign-key enforcement is switched on because
    SQLite ships with it disabled. On failure the sqlite3 error is printed
    and None is returned.
    """
    import os
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)
    connection = None
    try:
        connection = sqlite3.connect(db_file)
        connection.execute("PRAGMA foreign_keys = 1")
    except Error as err:
        print(err)
    return connection
def create_table(conn, create_table_sql, drop_table_name=None):
    """Execute *create_table_sql* on *conn*, optionally dropping a table first.

    Pass *drop_table_name* to DROP an existing table before creating the new
    one. Errors are printed rather than raised, matching the notebook's
    best-effort style.
    """
    cursor = conn.cursor()  # one cursor serves both statements
    if drop_table_name:  # You can optionally pass drop_table_name to drop the table.
        try:
            # Table names cannot be bound as SQL parameters; interpolation is
            # acceptable here because the name comes from code, not user input.
            cursor.execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
        except Error as e:
            print(e)
    try:
        cursor.execute(create_table_sql)
    except Error as e:
        print(e)
def execute_sql_statement(sql_statement, conn):
    """Run *sql_statement* against *conn* and return all fetched rows."""
    return conn.cursor().execute(sql_statement).fetchall()
# Getting Major Cities Data: read the raw CSV once and collect
#   Cities    - every data row,
#   Countries - unique country names (first-seen order),
#   MCities   - unique (city, latitude, longitude) triples (first-seen order).
db_file_name = 'normalized_Temperature.db'
conn = create_connection(db_file_name)
cur = conn.cursor()
header = None
Cities = []
Countries = []
MCities = []
with open('GlobalLandTemperaturesByMajorCity.csv', 'r') as file:
    count = 1  # NOTE(review): never used afterwards
    for line in file:
        if not line.strip():  # used for skipping empty lines!
            continue
        if header is None:
            # First non-empty line is the header; keep only its first 7 fields.
            header = line.strip().split(',')[0:7]
            continue
        # NOTE(review): a plain split(',') breaks on quoted commas inside a
        # field — the csv module would be safer; confirm the data has none.
        temp_lst = line.strip().split(',')
        Cities.append(temp_lst)
        if temp_lst[4] not in Countries:
            Countries.append(temp_lst[4])
        if (temp_lst[3], temp_lst[5], temp_lst[6]) not in MCities:
            MCities.append((temp_lst[3], temp_lst[5], temp_lst[6]))
# Lexicographic sort on the date string (column 0) orders rows chronologically.
Cities.sort()
# Creating Major City Table.
# CityID is INTEGER PRIMARY KEY, so SQLite auto-assigns ascending ids when
# the column is omitted from an INSERT.
sql_query = '''CREATE TABLE [MajorCities](
[CityID] INTEGER NOT NULL PRIMARY KEY,
[CityName] TEXT NOT NULL,
[Latitude] TEXT NOT NULL,
[Longitude] TEXT NOT NULL)
'''
create_table(conn, sql_query)
table [MajorCities] already exists
# Inserting into Major City Table; `with conn` commits on success,
# rolls back on error. CityID is auto-generated.
with conn:
    cur = conn.cursor()
    cur.executemany('''INSERT INTO MajorCities(CityName, Latitude, Longitude)
VALUES (?,?,?)''',MCities )
# Creating Country Table (CountryID auto-assigned, as above).
sql_query = '''CREATE TABLE [Country](
[CountryID] INTEGER NOT NULL PRIMARY KEY,
[Country] TEXT NOT NULL)
'''
create_table(conn, sql_query)
# executemany needs a sequence of tuples, so wrap each name in a 1-tuple.
Country_list = []
for c in Countries:
    Country_list.append((c, ))
# Sort so CountryIDs are assigned in alphabetical order.
Country_list.sort()
table [Country] already exists
# Inserting into Country Table inside a transaction.
with conn:
    cur = conn.cursor()
    cur.executemany('''INSERT INTO Country(Country)
VALUES (?)''',Country_list )
# Creating Major Cities dictionary: CityName -> CityID, used to normalize
# the fact table below.
sql_statement = """ SELECT CityID, CityName FROM MajorCities; """
cityrows = execute_sql_statement(sql_statement, conn)
city_dict = {}
for r in cityrows:
    city_dict[r[1]] = r[0]
# Creating dictionary mapping country to country ID (Country -> CountryID).
sql_statement = """ SELECT * FROM Country; """
countryrows = execute_sql_statement(sql_statement, conn)
country_dict = {}
for r in countryrows:
    country_dict[r[1]] = r[0]
# Creating GlobalLandTemperatureByMajorCity Table — the normalized fact
# table referencing MajorCities and Country by id.
# (Note: "Uncertainity" spelling is kept consistently across all tables.)
query = '''CREATE TABLE [GlobalLandTemperatureByMajorCity](
[ID] INTEGER NOT NULL PRIMARY KEY,
[Date] VARCHAR NOT NULL,
[CityID] INTEGER NOT NULL,
[CountryID] INTEGER NOT NULL,
[AverageTemperature] REAL,
[AverageTemperatureUncertainity] REAL,
FOREIGN KEY(CityID) REFERENCES MajorCities(CityID),
FOREIGN KEY(CountryID) REFERENCES Country(CountryID)
)
'''
create_table(conn, query)
majorcityrecords = []
for i in range(len(Cities)):
    # Keep only records from 1900 onward.
    if int(Cities[i][0][0:4]) < 1900:
        continue
    # Skip rows missing the temperature or its uncertainty.
    if Cities[i][1] == '' or Cities[i][2] == '':
        continue
    # strptime/strftime round-trip validates the date format; values are
    # rounded to 2 decimals before insertion.
    majorcityrecords.append((datetime.strptime(Cities[i][0], '%Y-%m-%d').strftime('%Y-%m-%d'), city_dict.get(Cities[i][3]), country_dict.get(Cities[i][4]), round(float(Cities[i][1]), 2), round(float(Cities[i][2]),2)))
# Inserting into GlobalLandTemperatureByMajorCity Table (transactional).
with conn:
    cur.executemany('''INSERT INTO GlobalLandTemperatureByMajorCity(Date, CityID, CountryID, AverageTemperature, AverageTemperatureUncertainity)
VALUES (?,?,?,?,?)''', majorcityrecords)
table [GlobalLandTemperatureByMajorCity] already exists
# Creating GlobalLandTemperatureByCountry table.
# NOTE: `Countries` is reused here — it now holds full CSV rows
# [Date, AvgTemp, AvgTempUncertainty, Country], not just country names.
header = None
Countries = []
with open('GlobalLandTemperaturesByCountry.csv', 'r') as file:
    for line in file:
        if not line.strip():  # used for skipping empty lines!
            continue
        if header is None:
            header = line.strip().split(',')
            continue
        Countries.append(line.strip().split(','))
# Schema for the per-country temperature table.
# FIX: removed the stray backticks that wrapped the [Country] line — SQLite
# treats backticks as identifier quotes, so the whole line parsed as one
# quoted column name and the CREATE TABLE statement failed.
query = '''CREATE TABLE [GlobalLandTemperatureByCountry](
[ID] INTEGER NOT NULL PRIMARY KEY,
[Date] VARCHAR NOT NULL,
[Country] TEXT NOT NULL,
[AverageTemperature] REAL,
[AverageTemperatureUncertainity] REAL)
'''
create_table(conn, query)
# Filter the country rows (post-1900, both measurements present) and insert.
countryrecords = []
for i in range(len(Countries)):
    if int(Countries[i][0][0:4]) < 1900:
        continue
    if Countries[i][1] == '' or Countries[i][2] == '':
        continue
    # Column 3 is the country name in this CSV.
    countryrecords.append((datetime.strptime(Countries[i][0], '%Y-%m-%d').strftime('%Y-%m-%d'), Countries[i][3], round(float(Countries[i][1]), 2), round(float(Countries[i][2]), 2)))
with conn:
    cur.executemany('''INSERT INTO GlobalLandTemperatureByCountry(Date, Country, AverageTemperature, AverageTemperatureUncertainity)
VALUES (?,?,?,?)''', countryrecords)
table [GlobalLandTemperatureByCountry] already exists
# Creating GlobalLandTemperatures table: worldwide monthly averages.
header = None
GlobalTemperatures = []
with open('GlobalTemperatures.csv', 'r') as file:
    for line in file:
        if not line.strip():  # used for skipping empty lines!
            continue
        if header is None:
            header = line.strip().split(',')[0:3]
            continue
        # Only the first three columns (Date, AvgTemp, Uncertainty) are kept.
        GlobalTemperatures.append(line.strip().split(',')[0:3])
query = '''CREATE TABLE [GlobalTemperatures](
[ID] INTEGER NOT NULL PRIMARY KEY,
[Date] VARCHAR NOT NULL,
[AverageTemperature] REAL,
[AverageTemperatureUncertainity] REAL)
'''
create_table(conn, query)
globalrecords = []
for i in range(len(GlobalTemperatures)):
    # Same filters as the other tables: post-1900, both values present.
    if int(GlobalTemperatures[i][0][0:4]) < 1900:
        continue
    if GlobalTemperatures[i][1] == '' or GlobalTemperatures[i][2] == '':
        continue
    globalrecords.append((datetime.strptime(GlobalTemperatures[i][0], '%Y-%m-%d').strftime('%Y-%m-%d'), round(float(GlobalTemperatures[i][1]), 2), round(float(GlobalTemperatures[i][2]), 2)))
with conn:
    cur.executemany('''INSERT INTO GlobalTemperatures(Date, AverageTemperature, AverageTemperatureUncertainity)
VALUES (?,?,?)''', globalrecords)
table [GlobalTemperatures] already exists
# Printing Table MajorCities to sanity-check the load.
sql_statement = """SELECT * FROM MajorCities"""
df = pd.read_sql_query(sql_statement, conn)
df
| CityID | CityName | Latitude | Longitude | |
|---|---|---|---|---|
| 0 | 1 | Abidjan | 5.63N | 3.23W |
| 1 | 2 | Addis Abeba | 8.84N | 38.11E |
| 2 | 3 | Ahmadabad | 23.31N | 72.52E |
| 3 | 4 | Aleppo | 36.17N | 37.79E |
| 4 | 5 | Alexandria | 31.35N | 30.16E |
| ... | ... | ... | ... | ... |
| 395 | 396 | Tokyo | 36.17N | 139.23E |
| 396 | 397 | Toronto | 44.20N | 80.50W |
| 397 | 398 | Umm Durman | 15.27N | 32.50E |
| 398 | 399 | Wuhan | 29.74N | 114.46E |
| 399 | 400 | Xian | 34.56N | 108.97E |
400 rows × 4 columns
# Printing table Country to sanity-check the load.
sql_statement = """SELECT * FROM Country"""
df = pd.read_sql_query(sql_statement, conn)
df
| CountryID | Country | |
|---|---|---|
| 0 | 1 | Afghanistan |
| 1 | 2 | Angola |
| 2 | 3 | Australia |
| 3 | 4 | Bangladesh |
| 4 | 5 | Brazil |
| ... | ... | ... |
| 142 | 143 | Ukraine |
| 143 | 144 | United Kingdom |
| 144 | 145 | United States |
| 145 | 146 | Vietnam |
| 146 | 147 | Zimbabwe |
147 rows × 2 columns
# Printing Table GlobalLandTemperatureByMajorCity (the normalized fact table).
sql_statement = """SELECT * FROM GlobalLandTemperatureByMajorCity"""
df = pd.read_sql_query(sql_statement, conn)
df
| ID | Date | CityID | CountryID | AverageTemperature | AverageTemperatureUncertainity | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1900-01-01 | 96 | 23 | -0.57 | 0.43 |
| 1 | 2 | 1900-01-01 | 45 | 1 | -1.22 | 1.53 |
| 2 | 3 | 1900-01-01 | 65 | 7 | -10.53 | 0.60 |
| 3 | 4 | 1900-01-01 | 93 | 9 | -11.57 | 1.32 |
| 4 | 5 | 1900-01-01 | 66 | 31 | -12.71 | 0.85 |
| ... | ... | ... | ... | ... | ... | ... |
| 409216 | 409217 | 2013-09-01 | 372 | 145 | 17.41 | 1.05 |
| 409217 | 409218 | 2013-09-01 | 363 | 123 | 18.31 | 1.23 |
| 409218 | 409219 | 2013-09-01 | 322 | 145 | 19.98 | 1.03 |
| 409219 | 409220 | 2013-09-01 | 356 | 145 | 23.30 | 0.98 |
| 409220 | 409221 | 2013-09-01 | 384 | 111 | 28.32 | 1.13 |
409221 rows × 6 columns
# Printing the entire denormalized data via a three-way join, restricted to
# a hand-picked set of cities.
sql_statement = """SELECT ID, Date, CityName, Country, AverageTemperature, AverageTemperatureUncertainity, Latitude, Longitude FROM GlobalLandTemperatureByMajorCity glt
INNER JOIN Country ON glt.CountryID = Country.CountryID
INNER JOIN MajorCities ON glt.CityID = MajorCities.CityID
WHERE MajorCities.CityName IN ('Bangalore', 'Bangkok', 'Paris', 'Harbin', 'Montreal', 'Moscow', 'Kiev', 'Toronto',
'Saint Petersburg', 'Tokyo', 'Berlin', 'Istanbul', 'Karachi', 'Dhaka', 'Rome', 'NewYork', 'Durban',
'Kano', 'Baghdad', 'Melbourne', 'Madrid', 'London', 'Berlin', 'Taiyuan', 'Florida')
"""
# NOTE(review): 'NewYork' (no space) and 'Florida' (a state, not a city)
# likely match nothing in MajorCities — verify the intended names. 'Berlin'
# appears twice in the IN list (harmless for SQL, but redundant).
df_data = pd.read_sql_query(sql_statement, conn)
df_data
| ID | Date | CityName | Country | AverageTemperature | AverageTemperatureUncertainity | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1900-01-01 | Tokyo | Japan | -0.57 | 0.43 | 36.17N | 139.23E |
| 1 | 3 | 1900-01-01 | Montreal | Canada | -10.53 | 0.60 | 45.81N | 72.69W |
| 2 | 4 | 1900-01-01 | Taiyuan | China | -11.57 | 1.32 | 37.78N | 111.86E |
| 3 | 5 | 1900-01-01 | Moscow | Russia | -12.71 | 0.85 | 55.45N | 36.85E |
| 4 | 10 | 1900-01-01 | Harbin | China | -20.75 | 1.34 | 45.81N | 125.77E |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 90025 | 409197 | 2013-08-01 | Dhaka | Bangladesh | 29.15 | 0.50 | 23.31N | 90.00E |
| 90026 | 409201 | 2013-08-01 | Karachi | Pakistan | 29.54 | 0.53 | 24.92N | 67.39E |
| 90027 | 409213 | 2013-08-01 | Baghdad | Iraq | 35.46 | 0.61 | 32.95N | 45.00E |
| 90028 | 409215 | 2013-09-01 | Montreal | Canada | 14.28 | 1.11 | 45.81N | 72.69W |
| 90029 | 409216 | 2013-09-01 | Toronto | Canada | 14.60 | 1.27 | 44.20N | 80.50W |
90030 rows × 8 columns
# Count NULLs per column; expected to be all zero because rows with missing
# temperatures were skipped during the load.
missing_values_count = df_data.isnull().sum()
missing_values_count
#No null values present as we already dropped them.
ID 0 Date 0 CityName 0 Country 0 AverageTemperature 0 AverageTemperatureUncertainity 0 Latitude 0 Longitude 0 dtype: int64
# Checking trends in Montreal over time (monthly series, so strong seasonality).
Montreal= df_data[['AverageTemperature', 'Date']].loc[df_data.CityName == 'Montreal']
sns.lineplot(y = Montreal.AverageTemperature ,x = Montreal.Date)
<AxesSubplot: xlabel='Date', ylabel='AverageTemperature'>
# Checking trends in Berlin over time (same monthly view as Montreal above).
Berlin= df_data[['AverageTemperature', 'Date']].loc[df_data.CityName == 'Berlin']
sns.lineplot(y = Berlin.AverageTemperature ,x = Berlin.Date)
<AxesSubplot: xlabel='Date', ylabel='AverageTemperature'>
# Added month and year to the data using SQLite strftime, cast to INT so
# pandas gets numeric columns.
sql_statement = """SELECT ID, CAST(strftime('%Y',Date) AS INT) AS Year,CAST(strftime('%m',Date) AS INT) AS Month, CityName, Country, AverageTemperature, AverageTemperatureUncertainity, Latitude, Longitude FROM GlobalLandTemperatureByMajorCity glt
INNER JOIN Country ON glt.CountryID = Country.CountryID
INNER JOIN MajorCities ON glt.CityID = MajorCities.CityID
WHERE MajorCities.CityName IN ('Bangalore', 'Bangkok', 'Paris', 'Harbin', 'Montreal', 'Moscow', 'Kiev', 'Toronto',
'Saint Petersburg', 'Tokyo', 'Berlin', 'Istanbul', 'Karachi', 'Dhaka', 'Rome', 'NewYork', 'Durban',
'Kano', 'Baghdad', 'Melbourne', 'Madrid', 'London', 'Berlin', 'Taiyuan', 'Florida')
"""
df_data2 = pd.read_sql_query(sql_statement, conn)
# Sanity check: all 12 months should be present.
df_data2.Month.unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
# Average temperature of all cities over time (seaborn aggregates the
# per-year spread across cities into mean + confidence band).
all_cities= df_data2[['AverageTemperature', 'Year']].loc[df_data2.Year > 1980]
sns.lineplot(y = all_cities.AverageTemperature ,x = all_cities.Year)
<AxesSubplot: xlabel='Year', ylabel='AverageTemperature'>
# Grouped dataframe by year.
# NOTE(review): GROUP BY Year with bare (non-aggregated) columns makes SQLite
# return one arbitrary row per year — these are single sample rows, not
# yearly averages. Confirm this is the intended behavior (an AVG() aggregate
# would give true yearly means).
sql_statement = """SELECT ID, CAST(strftime('%Y',Date) AS INT) AS Year, CityName, Country, AverageTemperature, AverageTemperatureUncertainity, Latitude, Longitude FROM GlobalLandTemperatureByMajorCity glt
INNER JOIN Country ON glt.CountryID = Country.CountryID
INNER JOIN MajorCities ON glt.CityID = MajorCities.CityID
WHERE MajorCities.CityName IN ('Bangalore', 'Bangkok', 'Paris', 'Harbin', 'Montreal', 'Moscow', 'Kiev', 'Toronto',
'Saint Petersburg', 'Tokyo', 'Berlin', 'Istanbul', 'Karachi', 'Dhaka', 'Rome', 'NewYork', 'Durban',
'Kano', 'Baghdad', 'Melbourne', 'Madrid', 'London', 'Berlin', 'Taiyuan', 'Florida') GROUP BY year
"""
df_data_by_year = pd.read_sql_query(sql_statement, conn)
df_data_by_year
| ID | Year | CityName | Country | AverageTemperature | AverageTemperatureUncertainity | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1900 | Tokyo | Japan | -0.57 | 0.43 | 36.17N | 139.23E |
| 1 | 1203 | 1901 | Montreal | Canada | -12.34 | 0.31 | 45.81N | 72.69W |
| 2 | 2404 | 1902 | Kiev | Ukraine | -1.28 | 1.03 | 50.63N | 31.69E |
| 3 | 3605 | 1903 | Montreal | Canada | -11.73 | 0.87 | 45.81N | 72.69W |
| 4 | 4802 | 1904 | Tokyo | Japan | -0.34 | 0.71 | 36.17N | 139.23E |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 109 | 130803 | 2009 | Berlin | Germany | -1.94 | 0.28 | 52.24N | 13.14E |
| 110 | 132005 | 2010 | Saint Petersburg | Russia | -13.44 | 0.36 | 60.27N | 29.19E |
| 111 | 133203 | 2011 | Montreal | Canada | -10.70 | 0.46 | 45.81N | 72.69W |
| 112 | 134406 | 2012 | Harbin | China | -19.80 | 0.48 | 45.81N | 125.77E |
| 113 | 135604 | 2013 | Montreal | Canada | -10.04 | 0.32 | 45.81N | 72.69W |
114 rows × 8 columns
# Change in temperature of Montreal over the years (from the per-year frame).
Montreal= df_data_by_year[['AverageTemperature', 'Year']].loc[df_data_by_year.CityName == 'Montreal']
sns.lineplot(y = Montreal.AverageTemperature ,x = Montreal.Year)
<AxesSubplot: xlabel='Year', ylabel='AverageTemperature'>
# Change in temperature of Berlin over the years (from the per-year frame).
Berlin = df_data_by_year[['AverageTemperature', 'Year']].loc[df_data_by_year.CityName == 'Berlin']
sns.lineplot(y = Berlin.AverageTemperature ,x = Berlin.Year)
<AxesSubplot: xlabel='Year', ylabel='AverageTemperature'>
# Change in temperature per month over years in Montreal — one line per
# calendar month separates the seasonal cycle from the long-term trend.
Montreal= df_data2[['AverageTemperature', 'Year', 'Month']].loc[df_data2.CityName == 'Montreal']
sns.lineplot(data=Montreal,
x='Year',
y='AverageTemperature',
hue='Month',
legend='full')
<AxesSubplot: xlabel='Year', ylabel='AverageTemperature'>
# Change in temperature in Berlin per month over the years (same view as
# the Montreal per-month plot).
Berlin = df_data2[['AverageTemperature', 'Year', 'Month']].loc[df_data2.CityName == 'Berlin']
sns.lineplot(data=Berlin,
x='Year',
y='AverageTemperature',
hue='Month',
legend='full')
<AxesSubplot: xlabel='Year', ylabel='AverageTemperature'>
ARIMA: Autoregressive Integrated Moving Average
A stationary time series is one whose mean and variance are constant over time. To check for stationarity we perform the Augmented Dickey-Fuller test.
# Data with top 20-25 cities for the ARIMA analysis.
# GROUP BY (Date, City, Country) collapses duplicate rows per month, and
# ORDER BY keeps each city's series contiguous.
df_majorcities_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Bangkok', 'Paris', 'Montreal', 'Moscow', 'Kiev', 'Toronto',
'Saint Petersburg', 'Tokyo', 'Berlin', 'Istanbul', 'Dhaka', 'Rome',
'Kano', 'Baghdad', 'Melbourne', 'Madrid', 'London', 'Berlin', 'Taiyuan', 'Bangalore', 'Harbin', 'Karachi', 'Durban')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_majorcities = pd.read_sql_query(df_majorcities_query, conn)
# NOTE(review): 'Berlin' appears twice in this list, so the stationarity loop
# below processes it twice.
Top20Cities = ['Bangkok', 'Paris', 'Montreal', 'Moscow', 'Kiev', 'Toronto',
'Saint Petersburg', 'Tokyo', 'Berlin', 'Istanbul', 'Dhaka', 'Rome',
'Kano', 'Baghdad', 'Melbourne', 'Madrid', 'London', 'Berlin', 'Taiyuan', 'Bangalore']
# Function to check Stationarity of the data.
import warnings
def checkStationarity(data):
    """Plot the series, compare split-half mean/variance, and run an ADF test.

    Side effect: reindexes *data* by its 'Date' column in place — later
    cells depend on this when they read df_city.index[-1] as a date string.
    Returns a verdict string based on the Augmented Dickey-Fuller p-value.
    """
    data.index = data['Date']
    pp.plot(data.index, data['AverageTemperature'])
    pp.title("Average Temperature from 1900 to 2013")
    pp.show()
    # Global side effect: silences warnings for the rest of the process.
    warnings.filterwarnings("ignore")
    from statsmodels.tsa.stattools import adfuller
    Temps = data['AverageTemperature'].values
    # Rough heuristic: a stationary series should have similar mean and
    # variance in its two halves.
    split = len(Temps)//2
    Temps1, Temps2 = Temps[0:split], Temps[split:]
    meanTemps1, meanTemps2 = Temps1.mean(), Temps2.mean()
    varTemps1, varTemps2 = Temps1.var(), Temps2.var()
    # NOTE(review): the threshold of 10 is arbitrary — confirm it suits the
    # scale of this data (degrees Celsius).
    if abs(meanTemps1-meanTemps2) <= 10 and abs(varTemps1-varTemps2) <= 10:
        print('This indicates the given timeseries might be stationary as the mean and variance does not differ much.')
    else:
        print('Given timeseries might not be stationary.')
    # Performing Augmented Dickey-Fuller Test to confirm stationarity.
    AdfullerResult = adfuller(Temps)
    print(AdfullerResult[1])
    p_value = AdfullerResult[1]
    # p < 0.05 rejects the unit-root null hypothesis -> stationary.
    if p_value < 0.05:
        return 'Time series is stationary'
    else:
        return 'Time series is not stationary'
# Checking stationarity of each of the Top 20 cities in turn.
for c in Top20Cities:
    # NOTE(review): `filter` shadows the builtin of the same name here.
    filter = df_majorcities.CityName == c
    # where() keeps the frame shape with NaNs; dropna() leaves only city c.
    city = df_majorcities.where(filter)
    print('For ' + c + ' : ' + checkStationarity(city.dropna()))
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.008984691858070224 For Bangkok : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.001579499393533814 For Paris : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.001260308867423435 For Montreal : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 3.618027447222019e-05 For Moscow : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.00012653816771237818 For Kiev : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.00048465407590177266 For Toronto : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 1.7333211705242409e-06 For Saint Petersburg : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.003762737503139213 For Tokyo : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.00012864866064311878 For Berlin : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 1.5250316405842935e-05 For Istanbul : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.0032061725213075 For Dhaka : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.008033299810170369 For Rome : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.013178894868493036 For Kano : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.004583719058711593 For Baghdad : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.04014004189014241 For Melbourne : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.006403587394719924 For Madrid : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.0013005645266157217 For London : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.00012864866064311878 For Berlin : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.037503886308115346 For Taiyuan : Time series is stationary
This indicates the given timeseries might be stationary as the mean and variance does not differ much. 0.02604889071579802 For Bangalore : Time series is stationary
# Plotting ACF, PACF curves of the first-differenced data to pick the
# ARIMA p (from PACF) and q (from ACF) orders.
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(df_majorcities['AverageTemperature'].diff().dropna())
pp.show()
plot_pacf(df_majorcities['AverageTemperature'].diff().dropna())
pp.show()
After going through the above Autocorrelation and Partial Autocorrelation curves, we conclude the best p and q values for our curve would be 2, 3 respectively.
# Applying ARIMA model on the data
from sklearn.metrics import accuracy_score  # NOTE(review): unused in the visible code; kept in case later cells need it
def apply_arima_model(data):
    """Fit an ARIMA(2,0,3) on the first 70% of *data* and plot predictions.

    Expects a DataFrame with 'AverageTemperature' and 'Date' columns.
    Prints the fitted model summary, plots the first 100 original vs.
    predicted values, and returns the fitted results object.
    """
    import warnings
    warnings.filterwarnings("ignore")
    # FIX: removed an unused `from pmdarima import auto_arima` import — the
    # name was never referenced and pmdarima is a needless extra dependency.
    shape = data.shape[0]
    # Dividing into train (first 70%) and test (last 30%), chronologically.
    train = data.iloc[:(int(0.7*shape))]
    test = data.iloc[-(int(0.3*shape)):]
    # Building the model; order = (p, d, q) chosen from the ACF/PACF plots.
    from statsmodels.tsa.arima.model import ARIMA
    model = ARIMA(train['AverageTemperature'], order=(2,0,3))
    model = model.fit()
    print(model.summary())
    start = 0
    # Predict over the whole span: in-sample plus the test range.
    end = len(train)+len(test)-1
    pred = model.predict(start=start, end=len(train)+len(test)-1)
    # Plot only the first 100 points so the curves stay readable.
    pp.plot(data["Date"][:100], data['AverageTemperature'][:100], label="Original Values")
    pp.plot(data["Date"][start:end+1][:100], pred[:100], label="Predicted Values" )
    pp.legend(loc="upper left")
    pp.show()
    return model
# Checking stationarity of the concatenated Top 20 cities series as a whole.
checkStationarity(df_majorcities)
Given timeseries might not be stationary. 0.04555450424239894
'Time series is stationary'
# Applying ARIMA model on 6 cities from the Top 20 cities.
df_6majorcities_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Bangkok', 'Paris', 'Montreal', 'Moscow', 'Kiev', 'Toronto')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
# FIX: this previously executed df_majorcities_query (the 20+ city query),
# so df_6majorcities silently contained ALL cities, not the six above.
df_6majorcities = pd.read_sql_query(df_6majorcities_query, conn)
checkStationarity(df_6majorcities)
apply_arima_model(df_6majorcities)
Given timeseries might not be stationary.
0.04555450424239894
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 21007
Model: ARIMA(2, 0, 3) Log Likelihood -51986.519
Date: Sun, 18 Dec 2022 AIC 103987.038
Time: 21:14:34 BIC 104042.706
Sample: 0 HQIC 104005.207
- 21007
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 16.2455 0.210 77.215 0.000 15.833 16.658
ar.L1 1.4324 0.011 127.257 0.000 1.410 1.454
ar.L2 -0.6003 0.010 -59.154 0.000 -0.620 -0.580
ma.L1 0.0232 0.012 1.932 0.053 -0.000 0.047
ma.L2 0.3318 0.008 43.377 0.000 0.317 0.347
ma.L3 0.1702 0.008 20.113 0.000 0.154 0.187
sigma2 8.2598 0.046 178.118 0.000 8.169 8.351
===================================================================================
Ljung-Box (L1) (Q): 2.01 Jarque-Bera (JB): 25598.39
Prob(Q): 0.16 Prob(JB): 0.00
Heteroskedasticity (H): 1.81 Skew: -0.46
Prob(H) (two-sided): 0.00 Kurtosis: 8.33
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<statsmodels.tsa.arima.model.ARIMAResultsWrapper at 0x2c8d70100>
# Predicting Future Temperature for 'Rome' from the Top 20 cities.
df_city_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Rome')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_city = pd.read_sql_query(df_city_query, conn)
# NOTE: checkStationarity reindexes df_city by 'Date' as a side effect;
# the strptime on df_city.index[-1] below depends on that.
checkStationarity(df_city)
model = apply_arima_model(df_city)
from pandas.tseries.offsets import DateOffset
from datetime import timedelta
from dateutil.relativedelta import relativedelta
# Build monthly future timestamps extending ~10 years past the last date.
future_dates=[datetime.strptime(df_city.index[-1], '%Y-%m-%d')+ relativedelta(months = x) for x in range(0,120)]
# Empty rows for the future dates (skip index 0, the last observed month).
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df_city.columns)
future_datest_df.tail()
future_df=pd.concat([df_city,future_datest_df])
# Dynamic forecast: 120 steps beyond the observed data.
future_df['forecast'] = model.predict(start = len(df_city)-1, end = len(df_city)+120, dynamic= True)
future_df[['AverageTemperature', 'forecast']].plot(figsize=(30, 10), title = 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023')
This indicates the given timeseries might be stationary as the mean and variance does not differ much.
0.008033299810170369
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 954
Model: ARIMA(2, 0, 3) Log Likelihood -1655.361
Date: Sun, 18 Dec 2022 AIC 3324.722
Time: 21:14:47 BIC 3358.747
Sample: 01-01-1900 HQIC 3337.684
- 06-01-1979
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 12.0603 0.061 197.396 0.000 11.941 12.180
ar.L1 1.7318 0.000 7609.422 0.000 1.731 1.732
ar.L2 -0.9998 0.000 -5373.286 0.000 -1.000 -0.999
ma.L1 -1.4428 0.035 -40.691 0.000 -1.512 -1.373
ma.L2 0.5405 0.060 8.952 0.000 0.422 0.659
ma.L3 0.2424 0.036 6.744 0.000 0.172 0.313
sigma2 1.9655 0.093 21.106 0.000 1.783 2.148
===================================================================================
Ljung-Box (L1) (Q): 1.30 Jarque-Bera (JB): 7.38
Prob(Q): 0.25 Prob(JB): 0.03
Heteroskedasticity (H): 0.95 Skew: -0.18
Prob(H) (two-sided): 0.62 Kurtosis: 3.23
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<AxesSubplot: title={'center': 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023'}>
# Predicting Future Temperature for 'Paris' in Top 20 cities
# (same pipeline as the Rome forecast above).
df_city_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Paris')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_city = pd.read_sql_query(df_city_query, conn)
# checkStationarity reindexes df_city by 'Date' (needed by index[-1] below).
checkStationarity(df_city)
model = apply_arima_model(df_city)
from pandas.tseries.offsets import DateOffset
from datetime import timedelta
from dateutil.relativedelta import relativedelta
future_dates=[datetime.strptime(df_city.index[-1], '%Y-%m-%d')+ relativedelta(months = x) for x in range(0,120)]
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df_city.columns)
future_datest_df.tail()
future_df=pd.concat([df_city,future_datest_df])
future_df['forecast'] = model.predict(start = len(df_city)-1, end = len(df_city)+120, dynamic= True)
future_df[['AverageTemperature', 'forecast']].plot(figsize=(30, 10), title = 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023')
This indicates the given timeseries might be stationary as the mean and variance does not differ much.
0.001579499393533814
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 954
Model: ARIMA(2, 0, 3) Log Likelihood -1810.529
Date: Sun, 18 Dec 2022 AIC 3635.058
Time: 21:14:57 BIC 3669.083
Sample: 01-01-1900 HQIC 3648.020
- 06-01-1979
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 10.4694 0.064 163.481 0.000 10.344 10.595
ar.L1 1.7320 0.000 1.1e+04 0.000 1.732 1.732
ar.L2 -1.0000 8.51e-05 -1.17e+04 0.000 -1.000 -1.000
ma.L1 -1.5400 0.034 -45.212 0.000 -1.607 -1.473
ma.L2 0.6439 0.054 11.889 0.000 0.538 0.750
ma.L3 0.2036 0.029 6.907 0.000 0.146 0.261
sigma2 2.5630 0.110 23.237 0.000 2.347 2.779
===================================================================================
Ljung-Box (L1) (Q): 0.12 Jarque-Bera (JB): 15.24
Prob(Q): 0.73 Prob(JB): 0.00
Heteroskedasticity (H): 1.09 Skew: -0.12
Prob(H) (two-sided): 0.44 Kurtosis: 3.57
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<AxesSubplot: title={'center': 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023'}>
# Predicting Future Temperature for 'Tokyo' in Top 20 cities
# (same pipeline as the Rome forecast above).
df_city_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Tokyo')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_city = pd.read_sql_query(df_city_query, conn)
# checkStationarity reindexes df_city by 'Date' (needed by index[-1] below).
checkStationarity(df_city)
model = apply_arima_model(df_city)
from pandas.tseries.offsets import DateOffset
from datetime import timedelta
from dateutil.relativedelta import relativedelta
future_dates=[datetime.strptime(df_city.index[-1], '%Y-%m-%d')+ relativedelta(months = x) for x in range(0,120)]
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df_city.columns)
future_datest_df.tail()
future_df=pd.concat([df_city,future_datest_df])
future_df['forecast'] = model.predict(start = len(df_city)-1, end = len(df_city)+120, dynamic= True)
future_df[['AverageTemperature', 'forecast']].plot(figsize=(30, 10), title = 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023')
This indicates the given timeseries might be stationary as the mean and variance does not differ much.
0.003762737503139213
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 954
Model: ARIMA(2, 0, 3) Log Likelihood -1566.592
Date: Sun, 18 Dec 2022 AIC 3147.184
Time: 21:15:05 BIC 3181.209
Sample: 01-01-1900 HQIC 3160.145
- 06-01-1979
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 12.5521 0.049 258.796 0.000 12.457 12.647
ar.L1 1.7320 0.000 7078.537 0.000 1.732 1.732
ar.L2 -0.9999 0.000 -9286.343 0.000 -1.000 -1.000
ma.L1 -1.4023 0.032 -43.268 0.000 -1.466 -1.339
ma.L2 0.3561 0.053 6.746 0.000 0.253 0.460
ma.L3 0.3638 0.032 11.499 0.000 0.302 0.426
sigma2 1.5322 0.077 19.907 0.000 1.381 1.683
===================================================================================
Ljung-Box (L1) (Q): 0.23 Jarque-Bera (JB): 6.97
Prob(Q): 0.63 Prob(JB): 0.03
Heteroskedasticity (H): 0.96 Skew: 0.19
Prob(H) (two-sided): 0.73 Kurtosis: 2.82
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<AxesSubplot: title={'center': 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023'}>
# Predicting Future Temperature for 'London' in Top 20 cities.
# Pull the full monthly temperature series for London from the normalized database.
df_city_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('London')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_city = pd.read_sql_query(df_city_query, conn)
# Check stationarity and fit the ARIMA model (helpers defined earlier in this file).
checkStationarity(df_city)
model = apply_arima_model(df_city)
# Only relativedelta is used below; DateOffset/timedelta imports were unused and removed.
from dateutil.relativedelta import relativedelta
# Build monthly timestamps extending past the last observed date.
# NOTE(review): assumes df_city.index holds 'YYYY-MM-DD' strings — presumably set
# by checkStationarity/apply_arima_model above; confirm against those helpers.
future_dates = [datetime.strptime(df_city.index[-1], '%Y-%m-%d') + relativedelta(months=x)
                for x in range(0, 120)]
# Drop index 0 (the last observed month) so only genuinely future rows are appended.
future_datest_df = pd.DataFrame(index=future_dates[1:], columns=df_city.columns)
future_datest_df.tail()
future_df = pd.concat([df_city, future_datest_df])
# Forecast dynamically from the end of the observed data across the appended range.
future_df['forecast'] = model.predict(start=len(df_city) - 1, end=len(df_city) + 120, dynamic=True)
future_df[['AverageTemperature', 'forecast']].plot(figsize=(30, 10), title = 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023')
This indicates the given timeseries might be stationary, as the mean and variance do not differ much.
0.0013005645266157217
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 954
Model: ARIMA(2, 0, 3) Log Likelihood -1659.277
Date: Sun, 18 Dec 2022 AIC 3332.554
Time: 21:15:13 BIC 3366.578
Sample: 01-01-1900 HQIC 3345.515
- 06-01-1979
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 9.5515 0.058 165.630 0.000 9.438 9.664
ar.L1 1.7319 0.000 9425.709 0.000 1.732 1.732
ar.L2 -0.9999 0.000 -9907.171 0.000 -1.000 -1.000
ma.L1 -1.4479 0.030 -48.629 0.000 -1.506 -1.389
ma.L2 0.4847 0.052 9.370 0.000 0.383 0.586
ma.L3 0.2913 0.030 9.867 0.000 0.233 0.349
sigma2 1.8758 0.081 23.064 0.000 1.716 2.035
===================================================================================
Ljung-Box (L1) (Q): 0.54 Jarque-Bera (JB): 3.56
Prob(Q): 0.46 Prob(JB): 0.17
Heteroskedasticity (H): 0.90 Skew: -0.06
Prob(H) (two-sided): 0.35 Kurtosis: 3.27
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<AxesSubplot: title={'center': 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023'}>
# Predicting Future Temperature for 'Toronto' in Top 20 cities.
# Pull the full monthly temperature series for Toronto from the normalized database.
df_city_query = '''Select Date, MajorCities.CityName, Country.Country, AverageTemperature,
AverageTemperatureUncertainity
FROM GlobalLandTemperatureByMajorCity
INNER JOIN MajorCities ON GlobalLandTemperatureByMajorCity .CityID = MajorCities.CityID
INNER JOIN Country ON GlobalLandTemperatureByMajorCity.CountryID=Country.CountryID
WHERE MajorCities.CityName in ('Toronto')
GROUP BY Date, MajorCities.CityName, Country.Country
ORDER BY MajorCities.CityName, Country.Country'''
df_city = pd.read_sql_query(df_city_query, conn)
# Check stationarity and fit the ARIMA model (helpers defined earlier in this file).
checkStationarity(df_city)
model = apply_arima_model(df_city)
# Only relativedelta is used below; DateOffset/timedelta imports were unused and removed.
from dateutil.relativedelta import relativedelta
# Build monthly timestamps extending past the last observed date.
# NOTE(review): assumes df_city.index holds 'YYYY-MM-DD' strings — presumably set
# by checkStationarity/apply_arima_model above; confirm against those helpers.
future_dates = [datetime.strptime(df_city.index[-1], '%Y-%m-%d') + relativedelta(months=x)
                for x in range(0, 120)]
# Drop index 0 (the last observed month) so only genuinely future rows are appended.
future_datest_df = pd.DataFrame(index=future_dates[1:], columns=df_city.columns)
future_datest_df.tail()
future_df = pd.concat([df_city, future_datest_df])
# Forecast dynamically from the end of the observed data across the appended range.
future_df['forecast'] = model.predict(start=len(df_city) - 1, end=len(df_city) + 120, dynamic=True)
future_df[['AverageTemperature', 'forecast']].plot(figsize=(30, 10), title = 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023')
This indicates the given timeseries might be stationary, as the mean and variance do not differ much.
0.00048465407590177266
SARIMAX Results
==============================================================================
Dep. Variable: AverageTemperature No. Observations: 955
Model: ARIMA(2, 0, 3) Log Likelihood -1992.964
Date: Sun, 18 Dec 2022 AIC 3999.927
Time: 21:15:25 BIC 4033.959
Sample: 01-01-1900 HQIC 4012.891
- 07-01-1979
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
const 6.0200 0.086 69.731 0.000 5.851 6.189
ar.L1 1.7321 5.19e-05 3.33e+04 0.000 1.732 1.732
ar.L2 -1.0000 1.82e-05 -5.49e+04 0.000 -1.000 -1.000
ma.L1 -1.5071 0.044 -33.929 0.000 -1.594 -1.420
ma.L2 0.6220 0.060 10.430 0.000 0.505 0.739
ma.L3 0.2185 0.035 6.177 0.000 0.149 0.288
sigma2 3.7940 0.217 17.453 0.000 3.368 4.220
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 32.03
Prob(Q): 0.91 Prob(JB): 0.00
Heteroskedasticity (H): 0.85 Skew: 0.02
Prob(H) (two-sided): 0.16 Kurtosis: 3.90
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
<AxesSubplot: title={'center': 'Average Temperature from 1900 to 2013 and forecasted future temperature from 2013 to 2023'}>
From the 'original vs. predicted' graph, the predicted values are close to the original values. Using this model, we forecasted the temperatures for the next few years for five cities — 'Rome', 'Paris', 'Tokyo', 'London', and 'Toronto' — and gained insight into how the temperatures would vary in those cities.
Although our model was not able to predict the extreme values, we could still see the minimum temperatures increasing slightly, which shows, as expected, that global warming is leading to an increase in minimum temperatures.